notebook.community

Edit and run



In [ ]:

    
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 200



In [ ]:

    
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
boston = load_boston()

# Feature matrix and regression target, as provided by the dataset object.
X = boston.data
y = boston.target

# Hold out a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



In [ ]:

    
# Print the description text that ships with the dataset object.
print(boston.DESCR)



In [ ]:

    
# Scatter every feature against the target, one panel per feature.
# The grid is sized from the data instead of assuming exactly 13 features.
n_features = X.shape[1]
n_cols = 5
n_rows = -(-n_features // n_cols)  # ceiling division: enough rows for all features

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    if i >= n_features:
        # Left-over panels in the last row have no feature to show.
        ax.set_visible(False)
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, boston.feature_names[i]))
    ax.set_ylabel("MEDV")



In [ ]:

    
# One box per feature column, to compare the value ranges of the features.
plt.boxplot(X)
# Boxes are positioned at 1..n_features; label each with its feature name.
plt.xticks(np.arange(1, X.shape[1] + 1),
           boston.feature_names, rotation=30, ha="right")
# The boxes show the input features, not the target, so the axis must not be
# labelled "MEDV".
plt.ylabel("Feature value")



In [ ]:

    
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply them
# to that same split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)



In [ ]:

    
from sklearn.model_selection import cross_val_score



In [ ]:

    
from sklearn.neighbors import KNeighborsRegressor

# Baseline: 10-fold cross-validation of a default k-NN regressor on the
# unscaled training features.
scores = cross_val_score(
    estimator=KNeighborsRegressor(),
    X=X_train,
    y=y_train,
    cv=10,
)
# Mean and spread of the per-fold scores.
scores.mean(), scores.std()



In [ ]:

    
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Same evaluation as before, but on standardized features.
# Scaling happens inside the pipeline so that each cross-validation fold fits
# its own scaler on the fold's training part only. Scaling the whole training
# set up front would let the held-out folds influence the preprocessing.
scaled_knn = make_pipeline(StandardScaler(), KNeighborsRegressor())
scores = cross_val_score(scaled_knn, X_train, y_train, cv=10)
# Mean and spread of the per-fold scores.
np.mean(scores), np.std(scores)

Categorical Variables



In [ ]:

    
import pandas as pd

# Toy frame with one numeric column and one string-valued categorical column.
# The borough is spelled "Manhattan"; the value ends up in the dummy column
# names, so the spelling matters.
df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],
                   'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})
df



In [ ]:

    
# Dummy-encode the frame. With no `columns` argument pandas picks the columns
# to encode by dtype (object/category), so the numeric 'salary' column is
# passed through unchanged.
pd.get_dummies(df)



In [ ]:

    
# Same toy data, but the borough is now stored as an integer code rather than
# a string.
df = pd.DataFrame(
    dict(
        salary=[103, 89, 142, 54, 63, 219],
        boro=[0, 1, 0, 2, 2, 3],
    )
)
df



In [ ]:

    
# 'boro' holds integer codes here, so it is named explicitly via `columns=`
# to have it dummy-encoded.
pd.get_dummies(df, columns=['boro'])

Exercise

Apply dummy encoding and scaling to the "adult" dataset consisting of income data from the census.

Bonus: visualize the data.



In [ ]:

    
# Read adult.csv (path relative to the working directory), using the first
# CSV column as the row index.
data = pd.read_csv("adult.csv", index_col=0)



In [ ]:

    
# %load solutions/load_adult.py